from IPython.display import HTML, display_html
from matplotlib import pyplot as plt
import numpy as np
import nvd3
import pandas as pd
from pandas import *
import seaborn as sns
import statsmodels as sm
# show up to 1000 rows and 100 columns when rendering DataFrames in the notebook
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100
# load the nvd3 javascript assets from a CDN so interactive charts render
nvd3.ipynb.initialize_javascript(use_remote=True)
All the data are stored in the data folder and let's take a look.
!ls -l -h data
The file LCDataDictionary.xlsx is the lookup table for finding the corresponding explanation of each column of the data.
# Load the column lookup table and render it as an HTML table.
data_dict = pd.read_excel('data/LCDataDictionary.xlsx')
HTML(data_dict.to_html())
Let's start by loading the raw data.
# Load the raw loan data and print a column/dtype/memory summary.
df_loan = pd.read_csv('data/loan.csv')
df_loan.info()
There are 887379 observations in total.
It is noticed there are quite a lot of variables containing missing values. Later we may need to dig deeper about why those variables contain missing values and how they would affect the analysis, and whether we need to put more effort to interpolate the values (or drop for good) in order to make proper prediction.
There are 74 variables in total:
Let's take a bit pre-processing on the data types and convert the categorical variables into categorical column in the dataframe.
# identity variables (unique per record -- not useful as predictors)
id_vars = ['id', 'member_id', 'policy_code', 'url']
# date variables: every column ending in '_d', plus earliest_cr_line.
# A list comprehension replaces filter() so this also works on Python 3,
# where filter() returns a lazy iterator that does not support +=.
date_vars = [c for c in df_loan.columns.tolist() if c[-2:] == '_d']
date_vars += ['earliest_cr_line']
# categorical variables: object/category columns not already claimed above
cat_vars = df_loan.dtypes[df_loan.dtypes.astype(str).isin(['object', 'category'])].index.tolist()
cat_vars = sorted(set(cat_vars).difference(id_vars + date_vars))
# numeric variables: everything left over
num_vars = sorted(set(df_loan.columns.tolist()).difference(id_vars + date_vars + cat_vars))
# convert identity and categorical variables into category dtype
for col in cat_vars + id_vars:
    df_loan[col] = df_loan[col].astype('category')
# parse date variables (strings like 'Dec-2015') into datetime dtype
for col in date_vars:
    df_loan[col] = pd.to_datetime(df_loan[col], format='%b-%Y')
Now let's profile the dataset.
df_loan.describe(include='all').T.join(DataFrame(df_loan.dtypes, columns=['dtype']))
As a beginning point, we may start by looking at the distribution of each unique variable by plotting the histogram. Let's start by exploring the numeric variables.
We have 48 variables to plot. Let's plot 10 at a time.
# first 10 numeric variables: summary stats plus a histogram for each.
# Parenthesized print() works identically on Python 2 and 3 for a single
# argument; the old statement form is a syntax error on Python 3.
# (The unused enumerate index was also dropped.)
for col in num_vars[:10]:
    print(df_loan[col].describe())
    df_loan[col].hist(figsize=(4, 2))
    plt.title('histogram of ' + col)
    plt.show()
    print('')
    print('')
The distributions of some variables such as total_pymnt, total_acc, mths_since_last_delinq seem to be well shaped, but a couple of the first 10 including acc_now_delinq, annual_inc, collection_recovery_fee seem to have extreme outliers as their medians and 25% percentiles are close to 0 and most of the samples are attributed to only one bin.
Let's remove the top and bottom 1% of samples (the outliers) and re-plot the histograms.
# helper function to remove top/bottom n% outliers
def remove_outliers(df, col, n):
    """Return df restricted to rows where df[col] lies strictly between
    the n-th and (100-n)-th percentiles; null values are dropped first."""
    df = df[~df[col].isnull()]
    lower = np.percentile(df[col], n)
    upper = np.percentile(df[col], 100 - n)
    return df[(df[col] > lower) & (df[col] < upper)]
# helper function to plot the histogram
def plot_hist(df, col, no_outliers=True, describe=True, figsize=(4, 2), **kwargs):
    """Optionally print describe() stats, then draw a histogram of df[col].

    When no_outliers is True the top/bottom 1% are trimmed first, which can
    leave the frame empty -- that case is reported instead of plotted.
    (print() replaces the Python-2-only print statement.)
    """
    df = df[~df[col].isnull()]
    if no_outliers:
        df = remove_outliers(df, col, 1)
    if describe:
        print(df[col].describe())
    if df[col].shape[0] == 0:
        print('All records have been filtered. Plot not displayed')
    else:
        df[col].hist(figsize=figsize, **kwargs)
        plt.title('histogram of ' + col)
        plt.show()
    print('')
    print('')
# re-plot the first 10 numeric variables, this time trimming outliers
for col in num_vars[:10]:
    plot_hist(df_loan, col, no_outliers=True)
Well, now the distribution looks much better after getting rid of the extreme outliers.
By looking at the histograms, we can see some patterns:
# Close look at the actual values for Case # 5 concluded above.
# (print() replaces the Python-2-only print statement.)
print('acc_now_delinq')
print(df_loan['acc_now_delinq'].value_counts())
print('')
print('')
print('collections_12_mths_ex_med')
print(df_loan['collections_12_mths_ex_med'].value_counts())
The above two still seem to follow an exponential decreasing pattern. Let's plot them on log scale and see what happens.
# Plot the two exponentially-decaying variables side by side on a log y-scale.
plt.figure(figsize=(10, 4))
for subplot_pos, key in ((121, 'acc_now_delinq'), (122, 'collections_12_mths_ex_med')):
    plt.subplot(subplot_pos)
    df_loan[key].value_counts().plot(logy=True)
    plt.title('distribution of ' + key)
plt.show()
The trend is clearly reflected. I wonder whether these two may affect the following analysis. Will they be useful indicators of a prediction target? Let's leave it for now.
We can now plot the rest of the variables, still, filtering out the top/bottom 1%.
# histograms for the remaining numeric variables, outliers trimmed
for col in num_vars[10:]:
    plot_hist(df_loan, col, no_outliers=True)
Nothing really surprising here as most of the variables can fall into the 5 categories of distributions described previously. Yet apparently pub_rec seems to be a boolean variable as it only contains 2 elements, and total_rec_late_fee (though very small sample size, ~1.3k) is mostly close to 14.99.
# helper function to plot categorical variables
def plot_cat_var(df, col):
    """Print the top-20 value counts of a categorical column (with percent
    and cumulative percent) and draw a bar-chart histogram of them.
    (print() replaces the Python-2-only print statement.)
    """
    n = 20
    _df = DataFrame(df[col].value_counts()).rename(columns={col: 'count'})
    _df['pct'] = _df['count'].astype(float) / _df['count'].sum()
    _df['cum_pct'] = _df['pct'].cumsum()
    print('variable: ' + col)
    print(_df[:n])  # top n
    _df[:n]['count'].plot(kind='bar', title='histogram of ' + col, figsize=(8, 4))
    plt.show()
    print('')
    print('')
# iterate through each categorical variables
for col in cat_vars:
plot_cat_var(df_loan, col)
Let's plot the time series by each date variable.
# helper function to plot date variables (original comment said "categorical"
# -- copy/paste error)
def plot_date_var(df, col):
    """Plot the count of records per date value of df[col] as a line chart.

    NOTE(review): value_counts() orders by count, not by date, so the x-axis
    of the line chart may be unordered; consider .sort_index() -- kept as-is
    to preserve the original behaviour.
    (print() replaces the Python-2-only print statement.)
    """
    _df = DataFrame(df[col].value_counts()).rename(columns={col: 'count'})
    print('variable: ' + col)
    _df['count'].plot(title='line chart of ' + col, figsize=(8, 4))
    plt.show()
    print('')
    print('')
# draw the time series for every date variable
for date_col in date_vars:
    plot_date_var(df_loan, date_col)
As the first line chart (issue_d) shows, the number of loans increases steadily since 2007 and throughout the next 9 years. However, the trend has been quite volatile since the middle of 2014.
The time series charts for the other 3 date variables don't make much sense when visualised on a date scale, as their status is dynamic depending on how recent the loan is.
After performing the univariate analysis, we have been able to gain a fundamental understanding of the dataset. Now it is time to investigate how each variable correlates with each other.
It would be nice to look at every combination of variables. However, we have 48 numeric variables and 17 categorical variables, and checking each pair would be quite expensive: C^2_48 = 1128 combinations!
Therefore, we have to do some preprocessing of the dataset.
I wonder how the variables correlate with each other. Is there any relationship between two variables? How one variable affects another?
We start by plotting a correlation matrix for numeric variables.
# sample size (only used by the commented-out sampled variant)
n = 100000
# ignore the joint-account variables: too many missing values
_vars = list(set(num_vars).difference(set(['annual_inc_joint', 'dti_joint'])))
# correlation matrix over the selected numeric variables
# (.loc replaces the deprecated-and-removed .ix indexer)
df_corr = df_loan.loc[:, sorted(_vars)].corr()
# candidate redundant variables: anything with |corr| > 0.7 to another variable
vars_red = df_corr[(df_corr.abs() > 0.7) & (df_corr != 1.0)].dropna(how='all').index.tolist()
# vars_red2 are manually selected by performing the analysis in the following section
vars_red2 = [
    'revol_bal',
    'total_rev_hi_lim',
    'recoveries',
    'open_il_24m',
    'installment',
    'open_rv_24m',
    'last_pymnt_amnt',
    'loan_amnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'out_prncp']
# non-redundant variables = everything minus vars_red, plus the manual picks
vars_no_red = list(set(_vars).difference(vars_red)) + vars_red2
_vars2 = sorted(vars_no_red)
df_corr2 = df_corr.loc[_vars2, _vars2]
# plot the heatmap of the correlation matrix
plt.figure(figsize=(18, 18))
plt.title('correlation matrix')
sns.heatmap(df_corr2.fillna(0),
            cmap='RdBu_r', vmin=-1., vmax=1.)
# sns.plt was removed from seaborn; call matplotlib's pyplot directly
plt.show()
Looking at the whole correlation matrix will be quite overwhelming. Therefore we apply PCA on it in order to remove redundant variables that are highly correlated with each other.
The analysis below follows a tutorial about feature selection from R-bloggers.
def pca_analysis(df_corr, vars, title=None):
    """Project the correlation sub-matrix of `vars` onto its first two
    principal components, show each variable's distance from the origin as
    an HTML table, and draw an annotated scatter plot.

    The parameter name `vars` shadows the builtin but is kept because
    callers pass it by keyword.
    (.loc replaces the deprecated-and-removed .ix indexer.)
    """
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    _df = DataFrame(pca.fit_transform(df_corr.fillna(0).loc[vars, vars])).rename(columns={0: 'x', 1: 'y'})
    _df['radius'] = (_df.x**2 + _df.y**2)**0.5
    _df['label'] = vars
    display_html(HTML(_df.sort_values('radius').to_html()))
    plt.figure(figsize=(10, 10))
    plt.scatter(_df.x, _df.y, alpha=1, marker='o')
    if title is not None:
        plt.title(title)
    for _x, _y, _l in zip(_df.x, _df.y, _df.label):
        plt.annotate(_l, (_x, _y))
    plt.show()
pca_analysis(df_corr=df_corr, vars=vars_red, title='PCA Analysis for Redundent Variables Correlations')
By looking at the visualization from PCA results above, we can conclude the redundent variables can be categorised into following groups:
Then we choose one from each group, which together make up the variable vars_red2 (redundant variables).
# re-assign numeric variables after removing the redundant ones.
# This MUST run before the pca_analysis call below, which reads num_vars2;
# the original ran the call first and would raise a NameError.
num_vars2 = sorted(vars_no_red)
# same analysis for all numeric variables
pca_analysis(df_corr=df_corr, vars=num_vars2, title='PCA Analysis of All Variables')
print("Number of numeric variables before/after removing redundent")
print("Before: {}".format(len(num_vars)))
print("After: {}".format(len(num_vars2)))
Now I am able to determine the target variables I will be looking at in depth.
# target variables
# hand-picked numeric variables to be analysed in depth in the sections below
target_vars = [
'int_rate',
'dti',
'loan_amnt',
'installment',
'annual_inc',
'out_prncp',
'open_acc',
'last_pymnt_amnt',
]
Let's start by investigating the interest rate (int_rate). Previously, the basic univariate analysis indicates
I wonder how the histogram looks like when we increase the number of bins.
# helper function to plot both histogram and kde plot
def plot_hist_kde(df, col, no_outliers=True, sample=None, figsize=(8, 4), bins=None, **kwargs):
    """Draw a histogram of df[col] with a kernel-density estimate overlaid
    on a secondary (twin) y-axis, after optional outlier removal and
    optional down-sampling."""
    if no_outliers:
        df = remove_outliers(df, col, 1)
    if sample is not None:
        df = df.sample(sample)
    # histogram on the primary axis
    fig, ax_hist = plt.subplots(figsize=figsize)
    ax_hist.hist(df[col].dropna().tolist(), bins=bins)
    ax_hist.set_ylabel('count')
    # kde plot on a twin axis, coloured green to match its label
    ax_kde = ax_hist.twinx()
    sns.kdeplot(df[col], ax=ax_kde, color='g')
    ax_kde.set_ylabel('kernel density', color='g')
    for tick_label in ax_kde.get_yticklabels():
        tick_label.set_color('g')
    plt.show()
plot_hist_kde(df_loan, 'int_rate', True, sample=df_loan.shape[0] / 20, figsize=(8, 4), bins=50)
Interest rate seems to be a multi-modal distribution peaking around 7.5, 12.5 and 17.5. If we look closely, it appears there are three more hidden peaks around 11, 16 and 24 as well. This implies it could be shaped by 6 individual gaussian distributions with different params. Let's find out by plotting the joint plots with other numeric variables and boxplots with other categorical variables.
First let's make some helper functions for the plottings below.
######################
# numeric variables #
######################
# helper function to plot bivariate kde/scatter/hex plot between two variables
def plot_scatter(df, x_col, y_col, sample=None, sample_frac=None, no_outliers=True, kind='kde', **kwargs):
    """Draw a seaborn jointplot of y_col vs x_col after optional 1% outlier
    removal and sampling (by absolute count or by fraction).
    (print() replaces the Python-2-only print statement.)
    """
    if no_outliers:
        df = remove_outliers(df, x_col, n=1)
        df = remove_outliers(df, y_col, n=1)
    if df.shape[0] == 0:
        return
    if sample is not None:
        df = df.sample(min(sample, df.shape[0]))
    if sample_frac is not None:
        df = df.sample(frac=sample_frac)
    print("{} vs {}".format(y_col, x_col))
    # NOTE(review): jointplot creates its own figure, so this figsize only
    # sizes an extra figure -- kept for behavioural parity.
    plt.figure(figsize=(4, 4))
    sns.jointplot(data=df, x=x_col, y=y_col, kind=kind)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()
    print('')
    print('')
# helper function to plot bivariate kde/scatter/hex plot between target variable and other variables
def plot_bivar_joint_plots(df, target_var, other_vars, sample=None, sample_frac=50, kind='kde', **kwargs):
    """Draw a jointplot of target_var against every variable in other_vars.

    NOTE(review): the default sample_frac=50 would be rejected by
    DataFrame.sample (frac must be <= 1 unless replace=True); callers in
    this file always override it, but confirm before relying on the default.
    (print() replaces the Python-2-only print statement; // keeps integer
    semantics under both Python 2 and 3.)
    """
    if sample is None:
        # take 1/50 of total dataset as sample.
        # For performance consideration, do not take too large sample size
        # if joint plot kind is set to 'kde'
        sample = df.shape[0] // 50
    print("""
################### {} ##################
""".format(target_var))
    for col in other_vars:
        if col == target_var:
            continue
        plot_scatter(df, target_var, col, sample=sample, sample_frac=sample_frac, kind=kind, **kwargs)
##########################
# categorical variables #
##########################
def plot_box(df, col, cat, sample=None, describe=True, no_outliers=True, **kwargs):
    """Print per-category describe() stats (optional) and draw a boxplot of
    numeric column col grouped by categorical column cat; skips plotting
    when the category cardinality exceeds 50 or no rows remain.
    (print() replaces the Python-2-only print statement.)
    """
    print("{} by {}".format(col, cat))
    if no_outliers:
        df = remove_outliers(df, col, 1)
    if sample is not None:
        df = df.sample(sample)
    if df[cat].cat.categories.shape[0] > 50:
        print("Cardinality too large to plot. Cardinality {}. Not displayed.".format(df[cat].cat.categories.shape[0]))
    elif df.shape[0] == 0:
        print("No data. Not displayed.")
    else:
        if describe:
            html = df.groupby(cat, as_index=1)[col].describe().reset_index()\
                .pivot_table(index=cat, columns='level_1', values=col).to_html()
            display_html(html, raw=True)
        df.boxplot(column=[col], by=[cat], **kwargs)
        plt.show()
    print('')
    print('')
def plot_bivar_box_plots(df, target_var, other_vars, sample=None, describe=True, **kwargs):
    """Draw a horizontal boxplot of target_var grouped by each variable in
    other_vars.

    Fixes two defects in the original: it plotted from the module-level
    df_loan instead of the df argument, and it hard-coded sample=None,
    silently discarding the sample argument.
    """
    for var in other_vars:
        plot_box(df, target_var, var, sample=sample, vert=0, describe=describe, **kwargs)
#################
# all together #
#################
def plot_bivar_all(df, target_var, num_vars, cat_vars, sample=None, sample_frac=None, describe=True, joint_kind='kde', **kwargs):
    """Run the full bivariate sweep for target_var: joint plots against the
    numeric variables, then boxplots against the categorical variables.

    Fixes a defect: the original hard-coded sample_frac=None in the
    joint-plot call, silently ignoring its own sample_frac argument.
    """
    plot_bivar_joint_plots(df, target_var, other_vars=num_vars, sample=sample, sample_frac=sample_frac, kind=joint_kind, **kwargs)
    plot_bivar_box_plots(df, target_var, other_vars=cat_vars, sample=sample, describe=describe, **kwargs)
plot_bivar_all(df_loan, target_var='int_rate', num_vars=num_vars2, cat_vars=cat_vars)
Conclusions drawn from above:
The finding of the relation between (sub)grade and interest rate could explain the multi-modal distribution question we found out previously. Let's plot the kde plot by grade.
# KDE Plot of Interest Rate by Loan Grade
target_var = 'int_rate'
cat_var = 'grade'
plt.figure(figsize=(16, 4))
# one kde curve per grade, all on the same axes
for grade in df_loan[cat_var].cat.categories.tolist():
    grade_rates = df_loan[df_loan[cat_var] == grade][target_var]
    sns.kdeplot(grade_rates, bw=0.3, label=grade)
plt.xlabel(target_var)
plt.title('KDE Plot of Interest Rate by Loan Grade')
plt.show()
# debugging: per-grade and per-(grade, sub_grade) describe() tables for int_rate
col = 'int_rate'
cat = 'grade'
df = df_loan
by_grade = df.groupby(cat, as_index=1)[col].describe().reset_index()
display_html(by_grade.pivot_table(index=cat, columns='level_1', values=col).to_html(), raw=True)
cat = ['grade', 'sub_grade']
by_sub_grade = df.groupby(cat, as_index=1)[col].describe().reset_index()
display_html(by_sub_grade.pivot_table(index=cat, columns='level_2', values=col).dropna().to_html(), raw=True)
Now let's encode sub_grade and look at its correlation with int_rate.
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
# encode sub_grade (A1, A2, ...) as ordinal integers; the encoder's
# alphabetical ordering happens to match the grade ordering.
# (df is the alias for df_loan set in the previous cell.)
df['sub_grade_enc'] = enc.fit_transform(df['sub_grade'])
# // keeps integer semantics under both Python 2 and Python 3
plot_scatter(df_loan, 'int_rate', 'sub_grade_enc', sample=df_loan.shape[0] // 50)
We get 0.97 of Pearson Correlation which indicates a strong correlation between sub_grade and int_rate. We can also get 0.94 R-squares when applying a linear model of the two variables.
# Engineering new variables: whole months elapsed between each loan's
# issue date and the most recent issue date in the dataset
today = df_loan['issue_d'].max()
elapsed_days = (today - df_loan['issue_d']).dt.days
df_loan['mths_to_date'] = (elapsed_days / 30).astype(int)
Let's investigate dti. Again we plot the histogram, kdeplot, jointplots and boxplots.
# summary stats and histogram+kde for the debt-to-income ratio
df_loan[['dti']].describe().T
plot_hist_kde(df_loan, col='dti', no_outliers=True, bins=30)
It seems dti follows bell-shape, slightly left-skewed. This implies its distribution is hardly separable and not influenced by other variables. Can we assume debt-to-income ratio is stable across other variables?
plot_bivar_all(df_loan, target_var='dti', num_vars=num_vars2, cat_vars=cat_vars)
We don't really get much strong correlation across the other variables.
Conclusions:
plot_hist_kde(df_loan, col='loan_amnt', no_outliers=False, bins=50)
The distribution has peaks around some exact round numbers, including 5k, 10k, 15k, 20k, 25k, 30k and 35k.
# the ten most common loan amounts
df_loan.loan_amnt.value_counts().head(10)
# (// keeps integer semantics under both Python 2 and Python 3)
plot_bivar_all(df_loan, 'loan_amnt', num_vars2, cat_vars, sample=df_loan.shape[0] // 50)
# eyeball the principal/recovery columns per loan status
# (.loc replaces the deprecated-and-removed .ix indexer)
df_loan.loc[:, ['loan_status', 'loan_amnt', 'out_prncp', 'total_rec_prncp', 'recoveries', 'collection_recovery_fee']].head(100)
# variables most correlated with loan_amnt
# (sort_values replaces the long-removed DataFrame.sort)
df_corr2.loc[['loan_amnt']].T.rename(columns={'loan_amnt': 'corr'}).sort_values('corr', ascending=0)[:10]
We could draw conclusions from above analysis.
We care about if a loan will be full paid back with no defaults. Let's see what the loan status looks like when comparing across other variables.
"""
Charged Off 8000.0 12000.0 19200.00 43040.0 34975.0 13874.283806 1850.0 7403.217333
Current 8500.0 13200.0 20000.00 569677.0 34975.0 14442.792802 1825.0 7447.908997
Default 9262.5 13000.0 19418.75 1150.0 34875.0 14386.000000 2000.0 7166.756103
Does not meet the credit policy. Status:Charged Off 5000.0 8000.0 13100.00 727.0 25000.0 9913.686382 1925.0 6291.758303
Does not meet the credit policy. Status:Fully Paid 4800.0 7500.0 12112.50 1883.0 25000.0 9274.575146 1825.0 6061.028667
Fully Paid 7200.0 11625.0 17600.00 198379.0 34975.0 12846.627037 1825.0 7118.170696
In Grace Period 9325.0 14025.0 20000.00 5875.0 34625.0 14938.140426 1925.0 7521.982192
Issued 8000.0 13000.0 20000.00 7924.0 34925.0 14356.985109 1825.0 7709.116346
Late (16-30 days) 8837.5 13950.0 20000.00 2211.0 34550.0 14668.374039 2000.0 7614.741313
Late (31-120 days) 9000.0 13500.0 20000.00 10929.0 34975.0 14611.030286 1900.0 7367.577507
"""
# pre-processing: collapse the detailed loan_status values into 3 outcomes
def catogorise_loan_status(x):
    """Map a raw loan_status string to 'Good', 'Bad' or 'Pending'.

    (The typo in the function name is preserved -- it is referenced by this
    name below.)
    """
    good_statuses = ('Fully Paid',
                     'Does not meet the credit policy. Status:Fully Paid')
    bad_statuses = ('Charged Off',
                    'Default',
                    'Does not meet the credit policy. Status:Charged Off',
                    'Late (16-30 days)',
                    'Late (31-120 days)')
    if x in good_statuses:
        return 'Good'
    if x in bad_statuses:
        return 'Bad'
    return 'Pending'
# attach the simplified outcome as a categorical column and inspect its counts
df_loan['loan_outcome'] = df_loan['loan_status'].apply(catogorise_loan_status).astype('category')
df_loan['loan_outcome'].value_counts()
# boxplot of every numeric variable split by the simplified loan outcome
for num_col in num_vars2:
    plot_box(df_loan, num_col, 'loan_outcome')
Apparently, "Bad" loans tend to have higher dti, annual_inc, int_rate.
def plot_mulvar_scatter(df, x_col, y_col, cat, mask=None, no_outliers=True, **kwargs):
    """Scatter plot of y_col vs x_col coloured by categorical column cat,
    optionally pre-filtered by a boolean mask and trimmed of 1% outliers.

    Robustness fix: sample at most the number of rows available instead of
    a hard df.sample(50000), which raises when fewer rows remain after
    masking and outlier removal.
    """
    if mask is not None:
        df = df[mask]
    if no_outliers:
        df = remove_outliers(df, x_col, 1)
        df = remove_outliers(df, y_col, 1)
    df = df.sample(min(50000, df.shape[0]))
    g = sns.FacetGrid(df,
                      hue=cat,
                      size=10)
    g.map(plt.scatter, x_col, y_col, **kwargs)
    g.add_legend()
    plt.show()
# numeric variables to include in the pairplot scatter matrices below
target_num_vars = [
'dti',
'total_rec_int',
'annual_inc',
'int_rate',
'loan_amnt',
'tot_cur_bal',
'mths_to_date',
'sub_grade_enc',
'recoveries',
]
# sample the data: moderate dti and income, 50k rows
mask = (df_loan.dti < 40) & (df_loan.annual_inc < 100000)
df = df_loan[mask].sample(50000)
# scatter matrix of the target numeric variables, coloured by grade
sns.pairplot(df, hue='grade', vars=target_num_vars, diag_kind='kde')
plt.tight_layout()
plt.savefig('scatter_matrix.01.by.grade.png')
# same scatter matrix, restricted to resolved loans and coloured by outcome
df2 = df_loan[mask & (df_loan.loan_outcome != 'Pending')].sample(50000)
# drop 'Pending' from the categorical's levels so it doesn't appear in the legend
df2['loan_outcome'] = df2.loan_outcome.cat.remove_unused_categories()
sns.pairplot(df2,
hue='loan_outcome',
vars=target_num_vars, diag_kind='kde')
plt.tight_layout()
plt.savefig('scatter_matrix.02.by.loan_outcome.png')
# and the same matrix coloured by loan term
sns.pairplot(df2,
hue='term',
vars=target_num_vars, diag_kind='kde')
plt.tight_layout()
plt.savefig('scatter_matrix.03.by.term.png')
# interest rate vs encoded sub-grade, split by term
plot_mulvar_scatter(df2, 'int_rate', 'sub_grade_enc', 'term', no_outliers=False, alpha=1.0/5)